{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Testing File Filtering in LongContextRAG\n",
    "\n",
    "This notebook demonstrates how to test the file filtering functionality in the LongContextRAG class."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/miniconda3/envs/byzer-test/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "2024-08-26 18:19:38,886\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n",
      "\u001b[32m2024-08-26 18:19:41.311\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mbyzerllm.utils.connect_ray\u001b[0m:\u001b[36mconnect_cluster\u001b[0m:\u001b[36m48\u001b[0m - \u001b[1mJDK 21 will be used (/Users/allwefantasy/.auto-coder/jdk-21.0.2.jdk/Contents/Home)...\u001b[0m\n",
      "2024-08-26 18:19:41,349\tINFO worker.py:1596 -- Connecting to existing Ray cluster at address: 127.0.0.1:6379...\n",
      "2024-08-26 18:19:41,366\tINFO worker.py:1772 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n",
      "\u001b[32m2024-08-26 18:19:41.397\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[36mautocoder.rag.long_context_rag\u001b[0m:\u001b[36m_retrieve_documents\u001b[0m:\u001b[36m134\u001b[0m - \u001b[31m\u001b[1mError processing file /var/folders/dm/0xljd5nn10b7bwwmwv8w5v100000gn/T/tmpkwnpvtph/test4.docx: \"There is no item named 'word/document.xml' in the archive\"\u001b[0m\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Retrieved documents:\n",
      "- .serveignore\n",
      "- test3.pdf\n",
      "- test1.md\n",
      "\n",
      "Content of test1.md:\n",
      "# Test 1\n",
      "This is a markdown file.\n",
      "\n",
      "Testing ignore functionality:\n",
      "Is 'test2.txt' ignored: True\n",
      "Is 'test5.py' ignored: True\n",
      "Is 'test1.md' ignored: False\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import tempfile\n",
    "from autocoder.rag.long_context_rag import LongContextRAG\n",
    "from autocoder.common import AutoCoderArgs\n",
    "from byzerllm import ByzerLLM\n",
    "\n",
    "# Create a temporary directory for testing\n",
    "with tempfile.TemporaryDirectory() as temp_dir:\n",
    "    # Create some test files\n",
    "    files = [\n",
    "        ('test1.md', '# Test 1\\nThis is a markdown file.'),\n",
    "        ('test2.txt', 'This is a text file.'),\n",
    "        ('test3.pdf', b'%PDF-1.4\\n%\\xe2\\xe3\\xcf\\xd3\\n1 0 obj\\n<</Type/Catalog/Pages 2 0 R>>\\nendobj\\n2 0 obj\\n<</Type/Pages/Count 1/Kids[3 0 R]>>\\nendobj\\n3 0 obj\\n<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Resources<<>>>>\\nendobj\\nxref\\n0 4\\n0000000000 65535 f\\n0000000015 00000 n\\n0000000060 00000 n\\n0000000111 00000 n\\n trailer\\n<</Size 4/Root 1 0 R>>\\nstartxref\\n190\\n%%EOF\\n'),\n",
    "        ('test4.docx', b'PK\\x03\\x04\\x14\\x00\\x06\\x00\\x08\\x00\\x00\\x00!\\x00\\xf3S\\xc5\\xdd\\xd5\\x01\\x00\\x00\\xe2\\x05\\x00\\x00\\x13\\x00\\x08\\x02[Content_Types].xml \\xa2\\x04\\x02(\\xa0\\x00\\x02\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00
\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00PK\\x01\\x02\\x14\\x00\\x14\\x00\\x06\\x00\\x08\\x00\\x00\\x00!\\x00\\xf3S\\xc5\\xdd\\xd5\\x01\\x00\\x00\\xe2\\x05\\x00\\x00\\x13\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00[Content_Types].xmlPK\\x05\\x06\\x00\\x00\\x00\\x00\\x01\\x00\\x01\\x00A\\x00\\x00\\x00\\x16\\x02\\x00\\x00\\x00\\x00'),\n",
    "        ('test5.py', '# This is a Python file\\nprint(\"Hello, World!\")'),\n",
    "        ('.serveignore', '*.txt\\n*.py')\n",
    "    ]\n",
    "    \n",
    "    for filename, content in files:\n",
    "        destination = os.path.join(temp_dir, filename)\n",
    "        # Bytes payloads need binary mode; plain strings are written as text.\n",
    "        if isinstance(content, bytes):\n",
    "            open_mode = 'wb'\n",
    "        else:\n",
    "            open_mode = 'w'\n",
    "        with open(destination, open_mode) as f:\n",
    "            f.write(content)\n",
    "    \n",
    "    # Initialize LongContextRAG\n",
    "    args = AutoCoderArgs()\n",
    "    # Extension filtering is left disabled for this run; uncomment the next\n",
    "    # line to set required extensions on the args.\n",
    "    # args.required_exts = '.md,.pdf,.docx'\n",
    "    llm = ByzerLLM.from_default_model(\"deepseek_chat\")\n",
    "    rag = LongContextRAG(llm, args, temp_dir)\n",
    "    \n",
    "    # Test file retrieval. The result is iterated more than once below, so\n",
    "    # materialize it in case _retrieve_documents() returns a one-shot iterator.\n",
    "    documents = list(rag._retrieve_documents())\n",
    "    \n",
    "    print(\"Retrieved documents:\")\n",
    "    for doc in documents:\n",
    "        print(f\"- {doc.module_name}\")\n",
    "    \n",
    "    # Files matched by the .serveignore patterns must not be retrieved.\n",
    "    retrieved_names = {doc.module_name for doc in documents}\n",
    "    leaked = retrieved_names & {'test2.txt', 'test5.py'}\n",
    "    assert not leaked, f'ignored files were retrieved: {sorted(leaked)}'\n",
    "    \n",
    "    print(\"\\nContent of test1.md:\")\n",
    "    # Use a default so a missing file fails with a clear message instead of\n",
    "    # an opaque StopIteration.\n",
    "    test1_content = next((doc.source_code for doc in documents if doc.module_name == 'test1.md'), None)\n",
    "    assert test1_content is not None, 'test1.md was not retrieved'\n",
    "    print(test1_content)\n",
    "\n",
    "    # Test ignore functionality: print each result and assert it, so the\n",
    "    # notebook fails loudly if the .serveignore handling regresses.\n",
    "    print(\"\\nTesting ignore functionality:\")\n",
    "    expected_ignored = {'test2.txt': True, 'test5.py': True, 'test1.md': False}\n",
    "    for name, should_ignore in expected_ignored.items():\n",
    "        is_ignored = rag.ignore_spec.match_file(name)\n",
    "        print(f\"Is '{name}' ignored: {is_ignored}\")\n",
    "        assert is_ignored == should_ignore, f'unexpected ignore result for {name}'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Hello', ',', 'Ġhow', 'Ġare', 'Ġyou', '?']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "from tokenizers import Tokenizer\n",
    "\n",
    "# Path to the tokenizer definition. Set the TOKENIZER_JSON_PATH environment\n",
    "# variable to override the machine-specific default below.\n",
    "TOKENIZER_PATH = os.environ.get(\n",
    "    \"TOKENIZER_JSON_PATH\", \"/Users/allwefantasy/Downloads/tokenizer.json\"\n",
    ")\n",
    "tokenizer = Tokenizer.from_file(TOKENIZER_PATH)\n",
    "\n",
    "# Use the loaded tokenizer\n",
    "encoded = tokenizer.encode(\"Hello, how are you?\")\n",
    "encoded.tokens"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This test notebook does the following:\n",
    "\n",
    "1. Creates a temporary directory with various test files, including a `.serveignore` file.\n",
    "2. Initializes a LongContextRAG instance with default `AutoCoderArgs` (the `required_exts` setting is left commented out, so no extension requirements are applied).\n",
    "3. Retrieves documents using the `_retrieve_documents` method.\n",
    "4. Prints the names of retrieved documents.\n",
    "5. Displays the content of a specific retrieved document.\n",
    "6. Tests the ignore functionality by checking if certain files are ignored based on the `.serveignore` rules.\n",
    "\n",
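    "You can run this notebook to verify that the file filtering in LongContextRAG is working as expected, including the handling of the `.serveignore` file; to also exercise required extensions, uncomment the `args.required_exts` line. Note that in the recorded run the minimal `test4.docx` fixture fails to parse (an error is logged and the file does not appear in the results), and `.serveignore` itself is listed among the retrieved documents."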
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "byzer-test",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
